if (!require("pacman"))
install.packages("pacman")
# use this line for installing/loading
pacman::p_load (tidyverse,
palmerpenguins )
devtools::install_github("tidyverse/dsbox")
library(here)HW 01
0 - Setup
1 - Road traffic accidents in Edinburgh
accidents <- read_csv(here("data","accidents.csv"))
glimpse(accidents)Rows: 768
Columns: 31
$ id <chr> "2018950000002", "2018950000006", "2018950000012", …
$ easting <dbl> 327174, 324874, 330500, 321890, 320120, 331752, 325…
$ northing <dbl> 670941, 672457, 671750, 671640, 669330, 667988, 674…
$ longitude <dbl> -3.167032, -3.204252, -3.114026, -3.251772, -3.2794…
$ latitude <dbl> 55.92600, 55.93926, 55.93376, 55.93145, 55.91041, 5…
$ police_force <dbl> 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95,…
$ severity <chr> "Slight", "Slight", "Slight", "Slight", "Slight", "…
$ vehicles <dbl> 1, 1, 2, 3, 2, 3, 1, 1, 1, 2, 2, 1, 1, 1, 2, 1, 2, …
$ casualties <dbl> 1, 1, 2, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 4, 1, 1, 1, …
$ date <chr> "31/12/2018", "30/12/2018", "03/01/2018", "01/01/20…
$ day_of_week <chr> "Monday", "Sunday", "Wednesday", "Monday", "Thursda…
$ time <time> 14:59:00, 12:50:00, 14:34:00, 02:25:00, 09:00:00, …
$ district <dbl> 923, 923, 923, 923, 923, 923, 923, 923, 923, 923, 9…
$ highway <chr> "S12000036", "S12000036", "S12000036", "S12000036",…
$ first_road_class <chr> "Unclassified", "Unclassified", "A(M) road", "A(M) …
$ first_road_number <dbl> 0, 0, 6095, 71, 0, 720, 0, 0, 1, 700, 0, 0, 0, 0, 0…
$ road_type <chr> "Single carriageway", "Single carriageway", "Single…
$ speed_limit <dbl> 20, 20, 20, 30, 30, 70, 20, 30, 20, 20, 20, 20, 20,…
$ junction_detail <chr> "Other junction", "Other junction", "Crossroads", "…
$ junction_control <chr> "Give way or uncontrolled", "Give way or uncontroll…
$ second_road_class <chr> "Unclassified", "Missing / Out of range", "A-road",…
$ second_road_number <dbl> 0, -1, 6106, 0, 0, 720, 0, 0, 0, 700, 0, 0, 0, 0, 0…
$ ped_cross_human <chr> "None within 50 metres", "None within 50 metres", "…
$ ped_cross_physical <chr> "Pedestrian phase at traffic signal junction", "No …
$ light <chr> "Daylight", "Daylight", "Daylight", "Darkness - lig…
$ weather <chr> "Fine + no high winds", "Fine + no high winds", "Fi…
$ road_surface <chr> "Dry", "Dry", "Wet or damp", "Wet or damp", "Wet or…
$ special_condition <chr> "None", "None", "None", "None", "None", "None", "No…
$ hazard <chr> "None", "None", "None", "None", "None", "None", "No…
$ urban_rural <dbl> 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
$ police <chr> "Yes", "Yes", "Yes", "Yes", "Yes", "Yes", "Yes", "Y…
accidents <- accidents |>
mutate(day_type = case_when(
day_of_week %in% c("Saturday", "Sunday") ~ "Weekend",
day_of_week %in% c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday") ~ "Weekday",
TRUE ~ as.character(day_of_week)))
ggplot(accidents, aes(x = time, fill = severity)) +
geom_density(alpha=0.6) +
facet_wrap(~day_type, ncol=1) +
scale_fill_manual("Severity", values = c("#8E6C8A", "#5E9CA0", "#F4E04D")) +
labs(title = "Number of accidents throughout the day",
subtitle = "By day of week and severity",
x = "Time of day",
y = "Density") +
theme_minimal(base_size = 14)The figure illustrates the distribution of road traffic accidents in Edinburgh across different times of day, separated by weekday and weekend, and categorized by severity (Fatal, Serious, Slight).
On weekdays, the number of serious and slight accidents peaks between 16:00 and 18:00, likely corresponding to evening rush hours. Fatal accidents show a smaller peak around midday (12:00).
On weekends, the number of slight accidents also peaks around 16:00, but serious accidents tend to peak slightly later, between 18:00 and 20:00. Notably, there are no fatal accidents reported on weekends in this dataset.
2 - NYC marathon winners
nyc_marathon <- read_csv(here("data","nyc_marathon.csv"))
glimpse(nyc_marathon)Rows: 102
Columns: 7
$ year <dbl> 1970, 1970, 1971, 1971, 1972, 1972, 1973, 1973, 1974, 1974, 1…
$ name <chr> "Gary Muhrcke", NA, "Norman Higgins", "Beth Bonner", "Sheldon…
$ country <chr> "United States", NA, "United States", "United States", "Unite…
$ time <time> 02:31:38, NA, 02:22:54, 02:55:22, 02:27:52, 03:08:41, …
$ time_hrs <dbl> 2.527222, NA, 2.381667, 2.922778, 2.464444, 3.144722, 2.36500…
$ division <chr> "Men", "Women", "Men", "Women", "Men", "Women", "Men", "Women…
$ note <chr> "Course record", "No woman finishers", "Course record", "Worl…
nyc_marathon_clean <- nyc_marathon |>
filter(!is.na(time_hrs))
ggplot(nyc_marathon_clean, aes(x = time_hrs))+
geom_histogram(binwidth = 0.0625)+
labs(x = "Finishing Time (hours)",
y = "Count" )+
theme_minimal(base_size = 14)ggplot(nyc_marathon_clean, aes(y = time_hrs))+
geom_boxplot()+
labs(y = "Finishing Time (hours)" )+
theme_minimal(base_size = 14)The histogram shows the overall distribution of finishing times for all marathon runners, making it easier to observe the frequency of different time intervals. In contrast, the box plot summarizes key statistics such as the median, interquartile range, and potential outliers.
ggplot(nyc_marathon_clean, aes(x = division,y = time_hrs, fill = division))+
geom_boxplot()+
scale_fill_manual(values = c("Blue","Red"))+
labs(x = "Gender",
y = "Finishing Time (hours)" )+
theme_minimal(base_size = 14)This figure compares the distribution of marathon finishing times between male and female runners. It clearly shows that female runners tend to have longer finishing times than male runners.
ggplot(nyc_marathon_clean, aes(x = division,y = time_hrs, fill = division))+
geom_boxplot(show.legend = FALSE)+
scale_fill_manual(values = c("Blue","Red"))+
labs(x = "Gender",
y = "Finishing Time (hours)" )+
theme_minimal(base_size = 14)Since the gender information is already clearly labeled on the x-axis, the color legend becomes redundant. Removing the legend increases the data-to-ink ratio by reducing unnecessary visual elements.
ggplot(nyc_marathon_clean, aes(x = year,y = time_hrs, color = division,shap = division))+
geom_point(stat = "summary", fun = "mean")+
scale_color_manual(values = c("Blue","Red")) +
scale_shape_manual(values = c("5","7")) +
labs(x = "year",
y = "Finishing Time (hours)",
color = "Gender",
shape = "Gender")+
theme_minimal(base_size = 14)The plot shows how marathon finishing times have changed over the past decades. It reveals a notable downward trend in finishing times for both men and women. Additionally, men consistently finished faster than women throughout the recorded years.
3 - US counties
“geom_point(aes(x = median_edu, y = median_hh_income))”
This line creates a scatter plot to show the relationship between median years of education and median household income across U.S. counties.
“geom_boxplot(aes(x = smoking_ban, y = pop2017))”
This line creates a boxplot showing the distribution of 2017 population sizes grouped by smoking ban status.
However, combining these two plots doesn’t make sense because they use different x-axes: one is a continuous variable (median_edu) and the other is a categorical variable (smoking_ban).
The second plot makes it much easier to compare poverty levels across people from different median education levels. This is because all four groups share the same y-axis scale (poverty), which facilitates direct visual comparison.
This suggests that when your goal is to compare values on the y-axis, you should place the faceting variable across columns to keep the y-axis consistent. Conversely, if you are focusing on comparisons along the x-axis, it’s better to facet across rows.
library(openintro)
glimpse(county) Rows: 3,142
Columns: 15
$ name <chr> "Autauga County", "Baldwin County", "Barbour County"…
$ state <fct> Alabama, Alabama, Alabama, Alabama, Alabama, Alabama…
$ pop2000 <dbl> 43671, 140415, 29038, 20826, 51024, 11714, 21399, 11…
$ pop2010 <dbl> 54571, 182265, 27457, 22915, 57322, 10914, 20947, 11…
$ pop2017 <int> 55504, 212628, 25270, 22668, 58013, 10309, 19825, 11…
$ pop_change <dbl> 1.48, 9.19, -6.22, 0.73, 0.68, -2.28, -2.69, -1.51, …
$ poverty <dbl> 13.7, 11.8, 27.2, 15.2, 15.6, 28.5, 24.4, 18.6, 18.8…
$ homeownership <dbl> 77.5, 76.7, 68.0, 82.9, 82.0, 76.9, 69.0, 70.7, 71.4…
$ multi_unit <dbl> 7.2, 22.6, 11.1, 6.6, 3.7, 9.9, 13.7, 14.3, 8.7, 4.3…
$ unemployment_rate <dbl> 3.86, 3.99, 5.90, 4.39, 4.02, 4.93, 5.49, 4.93, 4.08…
$ metro <fct> yes, yes, no, yes, yes, no, no, yes, no, no, yes, no…
$ median_edu <fct> some_college, some_college, hs_diploma, hs_diploma, …
$ per_capita_income <dbl> 27841.70, 27779.85, 17891.73, 20572.05, 21367.39, 15…
$ median_hh_income <int> 55317, 52562, 33368, 43404, 47412, 29655, 36326, 436…
$ smoking_ban <fct> none, none, partial, none, none, none, NA, NA, none,…
county_clean <- county |>
filter(!is.na(homeownership),!is.na(poverty))
ggplot(county_clean, aes(x = homeownership, y = poverty))+
geom_point()+
labs(title = "Plot A") ggplot(county_clean, aes(x = homeownership, y = poverty))+
geom_point()+
geom_smooth(method = "gam", se = FALSE, color = "Blue")+
labs(title = "Plot B") ggplot(county_clean, aes(x = homeownership, y = poverty, group = metro))+
geom_point()+
geom_smooth(method = "gam",se = FALSE, color = "Green")+
labs(title = "Plot C") ggplot(county_clean, aes(x = homeownership, y = poverty, group = metro))+
geom_smooth(method = "gam", se = FALSE, color = "Blue")+
geom_point()+
labs(title = "Plot D") ggplot(county_clean, aes(x = homeownership, y = poverty, group = metro))+
geom_point(aes(color = metro))+
geom_smooth(method = "gam", se = FALSE, color = "Blue", aes(linetype = metro))+
guides(color = guide_legend(order = 2),
linetype = guide_legend(order = 1)) +
labs(title = "Plot E") ggplot(county_clean, aes(x = homeownership, y = poverty, group = metro, color = metro))+
geom_point()+
geom_smooth(method = "gam", se = FALSE, )+
labs(title = "Plot F") ggplot(county_clean, aes(x = homeownership, y = poverty))+
geom_point(aes(group = metro, color = metro))+
geom_smooth(method = "gam", se = FALSE, color = "Blue")+
labs(title = "Plot G") ggplot(county_clean, aes(x = homeownership, y = poverty))+
geom_point(aes(group = metro, color = metro))+
labs(title = "Plot H") 4 - Rental apartments in SF
credit <- read_csv(here("data","credit.csv"))
glimpse(credit)Rows: 400
Columns: 5
$ balance <dbl> 333, 903, 580, 964, 331, 1151, 203, 872, 279, 1350, 1407, 0, 2…
$ income <dbl> 14.891, 106.025, 104.593, 148.924, 55.882, 80.180, 20.996, 71.…
$ student <chr> "No", "Yes", "No", "No", "No", "No", "No", "No", "No", "Yes", …
$ married <chr> "Yes", "Yes", "No", "No", "Yes", "No", "No", "No", "No", "Yes"…
$ limit <dbl> 3606, 6645, 7075, 9504, 4897, 8047, 3388, 7114, 3300, 6819, 81…
library(scales)
ggplot(credit, aes(x = income,y = balance, color = student, shape = student ))+
geom_point(alpha = 0.6,show.legend = FALSE)+
geom_smooth(method = "lm", se = FALSE, show.legend = FALSE)+
labs(x = "Income", y = "Credit card balance")+
facet_grid(student ~ married, labeller = label_both)+
scale_x_continuous(labels = dollar_format(prefix = "$", suffix = "K"))+
scale_y_continuous(labels = dollar_format(prefix = "$"))+
theme_minimal(base_size = 14) +
theme( strip.background = element_rect(fill = "grey90"),
panel.border = element_rect(color = "Black", fill = NA))This figure illustrates the relationship between income and credit card balance, segmented by student status and marital status. Overall, there is a positive correlation between income and credit card balance. Compared to non-students, students tend to carry higher credit card balances at similar income levels. Among students, married individuals generally have lower balances than their unmarried peers with comparable incomes.
I believe that marital status and student status are useful predictors, in addition to income, for predicting credit card balance. Students tend to have higher credit card balances than non-students at similar income levels. Additionally, married individuals generally have lower balances than unmarried individuals, which suggests a potential interaction effect between marital status and student status.
library(scales)
credit <- credit|>
mutate(credit_utilization = balance/limit)
ggplot(credit, aes(x = income,y = credit_utilization, color = student, shape = student ))+
geom_point(alpha = 0.6,show.legend = FALSE)+
geom_smooth(method = "lm", se = FALSE, show.legend = FALSE)+
labs(x = "Income", y = "Credit utilization")+
facet_grid(student ~ married, labeller = label_both)+
scale_x_continuous(labels = dollar_format(prefix = "$", suffix = "K"))+
scale_y_continuous(labels = label_percent(suffix = "%"))+
theme_minimal(base_size = 14) +
theme( strip.background = element_rect(fill = "grey90"),
panel.border = element_rect(color = "Black", fill = NA))The relationships between income and credit utilization differ from those between income and credit card balance. Among non-students, income and credit utilization show a positive correlation, consistent with the pattern observed for credit card balance. However, for student groups, the correlation between income and credit utilization appears to follow an opposite trend compared to the relationship with credit card balance. Additionally, married individuals generally have lower credit utilization than unmarried individuals at similar income levels.
5 - Napoleon’s march.
napoleon <- read_rds(here("data","napoleon.rds"))
cities <- napoleon$cities
temps <- napoleon$temperatures
troops <- napoleon$troops
glimpse(cities)Rows: 20
Columns: 3
$ long <dbl> 24.0, 25.3, 26.4, 26.8, 27.7, 27.6, 28.5, 28.7, 29.2, 30.2, 30.4,…
$ lat <dbl> 55.0, 54.7, 54.4, 54.3, 55.2, 53.9, 54.3, 55.5, 54.4, 55.3, 54.5,…
$ city <chr> "Kowno", "Wilna", "Smorgoni", "Moiodexno", "Gloubokoe", "Minsk", …
glimpse(temps)Rows: 9
Columns: 5
$ long <dbl> 37.6, 36.0, 33.2, 32.0, 29.2, 28.5, 27.2, 26.7, 25.3
$ temp <dbl> 0, 0, -9, -21, -11, -20, -24, -30, -26
$ month <chr> "Oct", "Oct", "Nov", "Nov", "Nov", "Nov", "Dec", "Dec", "Dec"
$ day <dbl> 18, 24, 9, 14, 24, 28, 1, 6, 7
$ date <date> 1812-10-18, 1812-10-24, 1812-11-09, 1812-11-14, 1812-11-24, 1812…
glimpse(troops)Rows: 51
Columns: 5
$ long <dbl> 24.0, 24.5, 25.5, 26.0, 27.0, 28.0, 28.5, 29.0, 30.0, 30.3, …
$ lat <dbl> 54.9, 55.0, 54.5, 54.7, 54.8, 54.9, 55.0, 55.1, 55.2, 55.3, …
$ survivors <dbl> 340000, 340000, 340000, 320000, 300000, 280000, 240000, 2100…
$ direction <chr> "advancing", "advancing", "advancing", "advancing", "advanci…
$ group <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
library(ggrepel)
library(cowplot)
library(grid)
temps <- temps %>%
mutate(nice.label = paste0(temp, "°C"))
march.1812.plot.simple <- ggplot() +
geom_path(data = troops, aes(x = long, y = lat, group = group,
color = direction, size = survivors),
lineend = "round") +
geom_point(data = cities, aes(x = long, y = lat),
color = "#DC5B44") +
geom_text_repel(data = cities, aes(x = long, y = lat, label = city),
color = "#DC5B44", family = "sans") +
scale_size(range = c(0.5, 10)) +
scale_colour_manual(values = c("advancing" = "#DFC17E", "retreating" = "#252523")) +
guides(color = "none", size = "none") +
theme_void()
temps.1812.plot <- ggplot(data = temps, aes(x = long, y = temp)) +
geom_line() +
geom_label(aes(label = nice.label),
family = "sans", size = 2.5) +
labs(x = NULL, y = "° Celsius") +
scale_x_continuous(limits = ggplot_build(march.1812.plot.simple)$layout$panel_params[[1]]$x.range) +
scale_y_continuous(position = "right") +
coord_cartesian(ylim = c(-35, 5)) +
theme_bw(base_family = "sans") +
theme(panel.grid.major.x = element_blank(),
panel.grid.minor.x = element_blank(),
panel.grid.minor.y = element_blank(),
axis.text.x = element_blank(), axis.ticks = element_blank(),
panel.border = element_blank())
both.1812.plot.simple <- rbind(ggplotGrob(march.1812.plot.simple),
ggplotGrob(temps.1812.plot))
panels <- both.1812.plot.simple$layout$t[grep("panel", both.1812.plot.simple$layout$name)]
both.1812.plot.simple$heights[panels] <- unit(c(3, 1), "null")
grid.newpage()
grid.draw(both.1812.plot.simple)I adapted and reproduced the Napoleon’s March plot using code from the following blog post:
Andrew Heiss – Exploring Minard’s 1812 Plot with ggplot2
https://www.andrewheiss.com/blog/2017/08/10/exploring-minards-1812-plot-with-ggplot2/
This resource provided a solid foundation for understanding how to structure the plot using ggplot2. I made several modifications for clarity and styling purposes.
1. Used read_rds() and here() to locate and read the data file.
2. Used glimpse() to examine the structure of all three datasets.
3. Loaded the required libraries: ggrepel, cowplot, and grid.
4. Added a new column nice.label to the temps dataset to append a degree symbol to the temperature values.
5. Created the troop movement plot.
6. Used geom_path() to draw troop movement paths. The x-axis represents longitude and the y-axis represents latitude. Line color indicates direction (advancing or retreating), and line width reflects the number of survivors.
7.Used geom_point() to add city locations.
8. Used geom_text_repel() to label cities with non-overlapping text.
9. Used scale_size() to control minimum and maximum line thickness.
- Used scale_colour_manual() to assign colors for advancing and retreating directions.
- Created the temperature plot.
- Used geom_line() to draw a line connecting temperature points based on longitude.
- Used geom_label() to annotate temperature values using the nice.label column.
- Used labs() to set the y-axis label.
- Used scale_x_continuous() and scale_y_continuous() to align the axes and move the y-axis to the right.
16. Used coord_cartesian() to set the y-axis viewing range between -35 and 5 degrees Celsius.
- Used theme_bw() and additional theme() settings to remove grid lines, axis ticks, labels, and panel borders for a clean appearance.
18. Combined the two plots using ggplotGrob() and rbind(), and adjusted their relative heights with unit(c(3, 1), “null”).
library(ggrepel)
library(cowplot)
library(grid)
temps <- temps %>%
mutate(nice.label = paste0(temp, "°C"))
march.1812.plot.simple <- ggplot() +
geom_path(data = troops, aes(x = long, y = lat, group = group,
color = direction, size = survivors),
lineend = "round") +
geom_point(data = cities, aes(x = long, y = lat),
color = "#5E9CA0") +
geom_text_repel(data = cities, aes(x = long, y = lat, label = city),
color = "#5E9CA0", family = "sans") +
scale_size(range = c(0.5, 10)) +
scale_colour_manual(values = c("advancing" = "#DFA18E", "retreating" = "#254523")) +
guides(color = "none", size = "none") +
labs(title = "Napoleon’s march in ggplot2")+
theme_void()
temps.1812.plot <- ggplot(data = temps, aes(x = long, y = temp)) +
geom_line() +
geom_label(aes(label = nice.label),
family = "sans", size = 2.5, fill = "lightgreen") +
labs(x = NULL, y = "° Celsius") +
scale_x_continuous(limits = ggplot_build(march.1812.plot.simple)$layout$panel_params[[1]]$x.range) +
scale_y_continuous(position = "right") +
coord_cartesian(ylim = c(-35, 5)) +
theme_bw(base_family = "sans") +
theme(panel.grid.major.x = element_blank(),
panel.grid.minor.x = element_blank(),
panel.grid.minor.y = element_blank(),
axis.text.x = element_blank(), axis.ticks = element_blank(),
panel.border = element_blank())
both.1812.plot.simple <- rbind(ggplotGrob(march.1812.plot.simple),
ggplotGrob(temps.1812.plot))
panels <- both.1812.plot.simple$layout$t[grep("panel", both.1812.plot.simple$layout$name)]
both.1812.plot.simple$heights[panels] <- unit(c(3, 1), "null")
grid.newpage()
grid.draw(both.1812.plot.simple)